Create an interactive report

When you have finished with the notebook, convert it to HTML:

jupyter-nbconvert --execute worksheet-interactive.ipynb

You can find further interactive tools on the PyViz site.

Note:

I had to use the jupyter-nbconvert --execute worksheet-interactive.ipynb --ExecutePreprocessor.timeout=180 command to prevent timeout due to long computations.

I've chosen Chicago city's "Crimes - 2001 to present" dataset.

In [1]:
import os
import random
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
In [2]:
# Locate the dataset inside ./data. os.listdir returns entries in an
# OS-dependent order, so sort before taking the last one to make the
# file choice deterministic across machines and re-runs.
data_dir = './data/'
data_file = data_dir + sorted(os.listdir(data_dir))[-1]
In [3]:
crime_data = pd.read_csv(data_file)
C:\ProgramData\Miniconda3\lib\site-packages\IPython\core\interactiveshell.py:3063: DtypeWarning: Columns (21) have mixed types.Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
In [4]:
crime_data.head()
Out[4]:
ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic ... Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location
0 11034701 JA366925 01/01/2001 11:00:00 AM 016XX E 86TH PL 1153 DECEPTIVE PRACTICE FINANCIAL IDENTITY THEFT OVER $ 300 RESIDENCE False False ... 8.0 45.0 11 NaN NaN 2001 08/05/2017 03:50:08 PM NaN NaN NaN
1 11227287 JB147188 10/08/2017 03:00:00 AM 092XX S RACINE AVE 0281 CRIM SEXUAL ASSAULT NON-AGGRAVATED RESIDENCE False False ... 21.0 73.0 02 NaN NaN 2017 02/11/2018 03:57:41 PM NaN NaN NaN
2 11227583 JB147595 03/28/2017 02:00:00 PM 026XX W 79TH ST 0620 BURGLARY UNLAWFUL ENTRY OTHER False False ... 18.0 70.0 05 NaN NaN 2017 02/11/2018 03:57:41 PM NaN NaN NaN
3 11227293 JB147230 09/09/2017 08:17:00 PM 060XX S EBERHART AVE 0810 THEFT OVER $500 RESIDENCE False False ... 20.0 42.0 06 NaN NaN 2017 02/11/2018 03:57:41 PM NaN NaN NaN
4 11227634 JB147599 08/26/2017 10:00:00 AM 001XX W RANDOLPH ST 0281 CRIM SEXUAL ASSAULT NON-AGGRAVATED HOTEL/MOTEL False False ... 42.0 32.0 02 NaN NaN 2017 02/11/2018 03:57:41 PM NaN NaN NaN

5 rows × 22 columns

In [5]:
# Coordinates with NaNs dropped; dropna() keeps the original index of the
# valid rows, exactly like boolean masking on ~isnan.
X = crime_data['Longitude'].dropna()
Y = crime_data['Latitude'].dropna()

# Report how many entries lack coordinates
print(f'There are {len(crime_data) - len(X)}/{len(crime_data)} entries '
      f'({(1 - len(X)/len(crime_data))*100:.3f}% of all entries) without coordinates.')
There are 68361/7112256 entries (0.961% of all entries) without coordinates.

1. Some basic maps using geoviews, datashader and geopandas

In [6]:
from functools import partial

import colorcet as cc
import datashader as ds
import datashader.transfer_functions as tf
from datashader.colors import colormap_select

import geoviews as gv
from geoviews import opts

import geopandas as gpd
import cartopy.crs as ccrs
from holoviews.operation.datashader import datashade

gv.extension('bokeh')
In [7]:
cmap = partial(colormap_select)
In [8]:
# Quick datashader overview of all raw coordinates: aggregate the points on a
# 600x600 canvas, then log-shade with the 'fire' colormap on a black background.
fs = 1  # figure-scale multiplier
cvs = ds.Canvas(plot_width=fs*600, plot_height=fs*600)
agg = cvs.points(crime_data, 'Longitude', 'Latitude')
ds.transfer_functions.Image.border=0  # remove the default border around rendered images
img = tf.set_background(tf.shade(agg, cmap=cmap(cc.fire, 0.2), how='log'), 'black')
In [9]:
img
Out[9]:

The plot indicates that there is a faulty row (or rows) in the dataset where the coordinates are incorrect. We can easily filter these entries out.

In [10]:
# Keep only rows with plausible coordinates, reduced to the two coordinate
# columns. Note: crime_data itself is NOT modified — the cleaned coordinates
# live in the new crime_coords frame.
# The cutoff latitude was acquired by plotting the data on a temporary
# scatter plot.
cutoff_lat = 41.6
crime_coords = crime_data[crime_data['Latitude']>cutoff_lat][['Latitude', 'Longitude']]
In [11]:
# Determine correct scale of figure to create
# a figure with correct proportions
# Geographic extent of the cleaned coordinates, used to build figures
# with the correct width/height proportions.
x_min = crime_coords['Longitude'].min()
x_max = crime_coords['Longitude'].max()
y_min = crime_coords['Latitude'].min()
y_max = crime_coords['Latitude'].max()

# Width-to-height ratio of the data's bounding box
x_per_y = (x_max - x_min) / (y_max - y_min)

Visualization using geoviews, datashader and geopandas

In [12]:
# Load Chicago community-area boundaries as a GeoDataFrame.
# NOTE(review): dividing shape_area by 1e6 assumes m^2, but DOUGLAS comes out
# as 46 km^2 while the real community area is roughly 4.3 km^2 — the shapefile
# units are likely square feet; verify the conversion against the data source.
districts = gpd.read_file('./data/chicago_districts.shp',
                         encoding='utf8')
districts['area_km2'] = districts['shape_area'] / 1000 / 1000
districts.head()
Out[12]:
area area_num_1 area_numbe comarea comarea_id community perimeter shape_area shape_len geometry area_km2
0 0.0 35 35 0.0 0.0 DOUGLAS 0.0 4.600462e+07 31027.054510 POLYGON ((-87.60914 41.84469, -87.60915 41.844... 46.004621
1 0.0 36 36 0.0 0.0 OAKLAND 0.0 1.691396e+07 19565.506153 POLYGON ((-87.59215 41.81693, -87.59231 41.816... 16.913961
2 0.0 37 37 0.0 0.0 FULLER PARK 0.0 1.991670e+07 25339.089750 POLYGON ((-87.62880 41.80189, -87.62879 41.801... 19.916705
3 0.0 38 38 0.0 0.0 GRAND BOULEVARD 0.0 4.849250e+07 28196.837157 POLYGON ((-87.60671 41.81681, -87.60670 41.816... 48.492503
4 0.0 39 39 0.0 0.0 KENWOOD 0.0 2.907174e+07 23325.167906 POLYGON ((-87.59215 41.81693, -87.59215 41.816... 29.071742
In [13]:
# Load districts as Polygons and Locations as Points and Overlay them on map
# Wrap districts as Polygons (with hoverable community name and area) and the
# cleaned crime coordinates as Points; both use PlateCarree (plain lon/lat).
polys = gv.Polygons(data=districts, vdims=['community', 'area_km2'], crs=ccrs.PlateCarree())
points = gv.Points(data=crime_coords, kdims=['Longitude', 'Latitude'], crs=ccrs.PlateCarree())
In [14]:
# Compose the map: dark tile background, datashaded crime points, and the
# semi-transparent district polygons with hover tooltips on top.
plot = gv.tile_sources.CartoDark()\
    * datashade(points, expand=False, height=2000, width=2000,
                cmap=cc.fire, normalization='eq_hist')\
    * polys.opts(alpha=0.1, color='white', tools=['hover'])
# Size the figure using the data's aspect ratio computed earlier
plot.opts(width=int(1000*x_per_y), height=1000, bgcolor='black')
Out[14]:
In [15]:
gv.save(plot, './out/chicago_crime_map.html')

Pure datashader visualization

In [16]:
# Same datashader pipeline as the earlier overview image, but on the cleaned
# coordinates, at higher resolution, and with the data's true aspect ratio.
fs = 5  # figure-scale multiplier
cvs = ds.Canvas(plot_width=int(fs*200*x_per_y), plot_height=int(fs*200))
agg = cvs.points(crime_coords, 'Longitude', 'Latitude')
ds.transfer_functions.Image.border=0  # remove the default border around rendered images
img = tf.set_background(tf.shade(agg, cmap=cmap(cc.fire, 0.4), how='log'), 'black')
In [17]:
img
Out[17]:

Explore columns

In [18]:
crime_data.columns
Out[18]:
Index(['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
       'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat',
       'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate',
       'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude',
       'Location'],
      dtype='object')
In [19]:
def count_uniques_in_column(column, data=None):
    """Count the unique values in a column, ordered by descending frequency.

    Parameters
    ----------
    column : str
        Name of the column to analyze.
    data : pd.DataFrame, optional
        Frame to analyze. Defaults to the notebook-global ``crime_data``,
        preserving the original call signature.

    Returns
    -------
    tuple
        ``(unique values as a list, counts as a pd.Series indexed by value,
        DataFrame with columns [column, 'Count'])``.
    """
    if data is None:
        data = crime_data  # fall back to the notebook-global dataset

    # value_counts() sorts by descending frequency
    column_N = data[column].value_counts()

    column_index = list(column_N.index)
    column_count = list(column_N.values)
    column_N_df = pd.DataFrame(data={column: column_index, 'Count': column_count})

    return column_index, column_N, column_N_df
In [20]:
locations_index, locations_N, locations_N_df = count_uniques_in_column(column='Location Description')
In [21]:
locations_N[-80:-60]
Out[21]:
CHA PARKING LOT                                          44
CHA GROUNDS                                              43
CHA HALLWAY / STAIRWELL / ELEVATOR                       38
CHA HALLWAY                                              35
TAVERN                                                   34
BASEMENT                                                 31
FACTORY / MANUFACTURING BUILDING                         23
VEHICLE - OTHER RIDE SHARE SERVICE (LYFT, UBER, ETC.)    23
HOTEL                                                    22
OTHER RAILROAD PROPERTY / TRAIN DEPOT                    22
VEHICLE - COMMERCIAL                                     22
CTA PARKING LOT / GARAGE / OTHER PROPERTY                22
STAIRWELL                                                20
VESTIBULE                                                19
DRIVEWAY                                                 17
SCHOOL - PRIVATE GROUNDS                                 16
CLUB                                                     16
BARBER SHOP/BEAUTY SALON                                 15
OFFICE                                                   15
COLLEGE / UNIVERSITY - GROUNDS                           14
Name: Location Description, dtype: int64
In [22]:
crime_data[crime_data['Location Description'] == 'BASEMENT'].head()
Out[22]:
ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic ... Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location
1250658 1146 G617712 10/14/2001 06:15:00 PM 092XX S BLACKSTONE AVE 0110 HOMICIDE FIRST DEGREE MURDER BASEMENT True False ... 8.0 48.0 01A 1187726.0 1844111.0 2001 03/19/2019 04:11:22 PM 41.727329 -87.587921 (41.727329033, -87.58792067)
1407691 1263 G723360 12/02/2001 07:00:00 PM 023XX N CAMPBELL AVE 0110 HOMICIDE FIRST DEGREE MURDER BASEMENT True False ... 1.0 22.0 01A 1159499.0 1915451.0 2001 03/19/2019 04:11:22 PM 41.923719 -87.689362 (41.923718707, -87.689361689)
1442771 1363 HH205535 02/24/2002 01:40:00 AM 038XX W POLK ST. 0110 HOMICIDE FIRST DEGREE MURDER BASEMENT True False ... NaN NaN 01A 1150931.0 1896092.0 2002 03/19/2019 04:11:22 PM 41.870768 -87.721351 (41.870767914, -87.72135136)
1744210 1747 HH638914 09/10/2002 05:35:00 PM 033XX N MONTICELLO AVE 0110 HOMICIDE FIRST DEGREE MURDER BASEMENT False False ... 35.0 21.0 01A 1151466.0 1922006.0 2002 03/18/2019 04:08:09 PM 41.941868 -87.718705 (41.941867914, -87.718705481)
1744967 1748 HH639739 09/10/2002 09:35:00 PM 023XX W MELROSE ST 0110 HOMICIDE FIRST DEGREE MURDER BASEMENT True False ... 32.0 5.0 01A 1160158.0 1921527.0 2002 03/18/2019 04:08:09 PM 41.940378 -87.686772 (41.940378045, -87.686771907)

5 rows × 22 columns

In [23]:
crimes_index, crimes_N, crimes_N_df = count_uniques_in_column(column='Primary Type')
In [24]:
crimes_N
Out[24]:
THEFT                                1504270
BATTERY                              1300995
CRIMINAL DAMAGE                       808774
NARCOTICS                             730706
ASSAULT                               447112
OTHER OFFENSE                         442109
BURGLARY                              401592
MOTOR VEHICLE THEFT                   327140
DECEPTIVE PRACTICE                    290757
ROBBERY                               267217
CRIMINAL TRESPASS                     202785
WEAPONS VIOLATION                      79580
PROSTITUTION                           69181
PUBLIC PEACE VIOLATION                 49805
OFFENSE INVOLVING CHILDREN             48954
CRIM SEXUAL ASSAULT                    28847
SEX OFFENSE                            27033
INTERFERENCE WITH PUBLIC OFFICER       17180
GAMBLING                               14575
LIQUOR LAW VIOLATION                   14361
ARSON                                  11697
HOMICIDE                               10203
KIDNAPPING                              6895
INTIMIDATION                            4156
STALKING                                3684
CRIMINAL SEXUAL ASSAULT                  800
OBSCENITY                                661
CONCEALED CARRY LICENSE VIOLATION        566
PUBLIC INDECENCY                         178
NON-CRIMINAL                             172
OTHER NARCOTIC VIOLATION                 137
HUMAN TRAFFICKING                         63
NON - CRIMINAL                            38
RITUALISM                                 23
NON-CRIMINAL (SUBJECT SPECIFIED)           9
DOMESTIC VIOLENCE                          1
Name: Primary Type, dtype: int64
In [25]:
crime_data[crime_data['Primary Type'] == 'RITUALISM'].head()
Out[25]:
ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic ... Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location
38640 5014191 HL549619 08/14/2005 10:00:00 PM 001XX W 124TH ST 0510 RITUALISM AGG RIT MUT: HANDS/FIST/FEET SERIOUS INJURY RESIDENCE False False ... 9.0 53.0 04B NaN NaN 2005 08/17/2015 03:03:40 PM NaN NaN NaN
1413400 1322455 G013191 01/07/2001 02:15:00 AM 030XX N KILPATRICK AV 0492 RITUALISM AGG RITUAL MUT:KNIFE/CUTTING I RESIDENCE False True ... NaN NaN 04B 1144471.0 1919853.0 2001 08/17/2015 03:03:40 PM 41.936095 -87.744470 (41.936094634, -87.744469685)
1424221 1338253 G029580 01/14/2001 04:15:00 PM 036XX W AUGUSTA BV 0493 RITUALISM AGG RITUAL MUT:OTH DANG WEAPON SIDEWALK False False ... NaN NaN 04B 1151847.0 1906382.0 2001 08/17/2015 03:03:40 PM 41.898987 -87.717717 (41.898986783, -87.717717372)
1474577 1404636 G120817 03/02/2001 02:18:51 AM 001XX W 113 ST 0492 RITUALISM AGG RITUAL MUT:KNIFE/CUTTING I RESIDENCE False False ... NaN NaN 04B 1177318.0 1830063.0 2001 08/17/2015 03:03:40 PM 41.689020 -87.626469 (41.689020374, -87.626468818)
1508955 1448541 G172857 03/26/2001 09:28:28 PM 037XX N RACINE AV 0493 RITUALISM AGG RITUAL MUT:OTH DANG WEAPON STREET False False ... NaN NaN 04B 1167590.0 1924742.0 2001 08/17/2015 03:03:40 PM 41.949043 -87.659364 (41.94904308, -87.659363981)

5 rows × 22 columns

2. plotly piechart of crime types

In [26]:
import plotly.express as px
In [27]:
# Pie chart of the crime-type distribution. Very rare types (<= 20 cases)
# and the two "non-criminal" bookkeeping categories are excluded.
non_criminal = ('NON-CRIMINAL', 'NON - CRIMINAL')
pie_mask = (crimes_N_df.Count > 20) & ~crimes_N_df['Primary Type'].isin(non_criminal)

fig = px.pie(crimes_N_df[pie_mask],
             values='Count', names='Primary Type',
             title='Distribution of crimes by type',
             width=1200, height=700)
fig.update_traces(textposition='auto')
fig.update_layout(legend=dict(x=1, y=1.2))
fig.show()
fig.write_html('./out/crime_types_piechart.html')

3. plotly barplot of crime frequency

In [28]:
from datetime import datetime
In [29]:
dates = crime_data['Date']
In [30]:
# Convert Date strings to datetime-type values to handle them more easily
# Takes ~40 seconds
# Convert Date strings to datetime-type values to handle them more easily.
# Vectorized pd.to_datetime with an explicit format replaces the original
# Python-level strptime loop (~40 s) and preserves the Series index.
dates_datetime = pd.to_datetime(dates, format='%m/%d/%Y %I:%M:%S %p')
In [31]:
# Collect years, months and days individually by maintaining
# the original order of entries
# Extract (year, month, day) per entry, maintaining the original row order.
ysmsds = np.zeros((len(dates), 3))
ysmsds[:, 0] = [stamp.year for stamp in dates_datetime]
ysmsds[:, 1] = [stamp.month for stamp in dates_datetime]
ysmsds[:, 2] = [stamp.day for stamp in dates_datetime]
In [32]:
# Years, Months, Days
# Split the (year, month, day) matrix into separate 1-D arrays.
# NOTE(review): `ds` shadows the datashader alias imported earlier; this only
# works because no datashader call happens after this cell — consider renaming.
ys, ms, ds = ysmsds.T

ys_unique = np.unique(ys)
ms_unique = np.unique(ms)
ds_unique = np.unique(ds)
In [33]:
# Count crimes per years, per months and per days
def _count_by_unit(values):
    """Count occurrences of each unique value in `values`.

    Returns a DataFrame with string labels in 'Date' and integer counts in
    'Crime Count', sorted by the unit value (same order as np.unique).
    """
    uniques, counts = np.unique(values, return_counts=True)
    return pd.DataFrame(data={'Date': [str(int(u)) for u in uniques],
                              'Crime Count': counts})

# Count crimes per year, per month and per day-of-month. A single
# np.unique(return_counts=True) pass replaces the original per-unique
# np.where scans (O(n) instead of O(n * k)).
crime_per_y = _count_by_unit(ys)
crime_per_m = _count_by_unit(ms)
crime_per_d = _count_by_unit(ds)

a) most dangerous years

In [34]:
# Bar chart of crimes committed per year.
# Fixed the user-facing axis-label typo: 'Number of crimes, commited'.
fig = px.bar(crime_per_y, x='Date', y='Crime Count',
             hover_data=['Date', 'Crime Count'], color='Crime Count',
             labels={'Date' : 'Year',
                     'Crime Count':'Number of crimes committed'}, height=600)
fig.show()
fig.write_html('./out/crime_freqs_y.html')

b) most dangerous months

In [35]:
# Bar chart of crimes committed per calendar month.
# Fixed the user-facing axis-label typo: 'Number of crimes, commited'.
fig = px.bar(crime_per_m, x='Date', y='Crime Count',
             hover_data=['Date', 'Crime Count'], color='Crime Count',
             labels={'Date' : 'Month',
                     'Crime Count':'Number of crimes committed'},
             height=600)
fig.show()
fig.write_html('./out/crime_freqs_m.html')

c) most dangerous days

In [36]:
# Bar chart of crimes committed per day-of-month.
# Fixed the user-facing axis-label typo: 'Number of crimes, commited'.
fig = px.bar(crime_per_d, x='Date', y='Crime Count',
             hover_data=['Date', 'Crime Count'], color='Crime Count',
             labels={'Date' : 'Day',
                     'Crime Count':'Number of crimes committed'}, height=600)
fig.show()
fig.write_html('./out/crime_freqs_d.html')

4. bokeh jitter plot of crimes' frequency on weekdays

In [37]:
from bokeh.io import output_file, output_notebook, show

from bokeh.plotting import figure
from bokeh.transform import jitter, factor_cmap
from bokeh.models import ColumnDataSource
In [38]:
output_notebook()
Loading BokehJS ...
In [39]:
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
In [40]:
# Per-entry weekday labels and times-of-day. Despite the `dict_` prefix,
# both are plain lists — the names are misleading; consider renaming.
dict_weekdays = [day_names[d.weekday()] for d in dates_datetime]
dict_time = [d.time() for d in dates_datetime]
In [41]:
# Frame indexed by the full timestamp, holding the weekday label and the
# time-of-day of each entry (source data for the jitter plot below).
crimes_datetime = pd.DataFrame(data={'Weekday' : dict_weekdays, 'Time' : dict_time},
                               index=dates_datetime)
crimes_datetime.index.name = 'datetime'
In [42]:
# Output HTML file of figure
output_file('./out/crime_freqs_weekday.html')

# Down-sample to 0.5% of the rows so the browser can render the scatter.
# A fixed random_state makes the sample — and thus the saved figure —
# reproducible across re-runs (previously the plot changed every run).
source = ColumnDataSource(crimes_datetime.sample(frac=0.005, random_state=42))

p = figure(plot_width=1000, plot_height=400, y_range=day_names, x_axis_type='datetime',
           title='Crimes committed by Time of Day (US) in Chicago (2001—2020)')

# Jitter points vertically within each weekday band to reduce overplotting
p.circle(x='Time', y=jitter('Weekday', width=0.6, range=p.y_range),  source=source, alpha=0.3)

p.xaxis[0].formatter.days = ['%Hh']  # show hour-of-day on the x axis
p.x_range.range_padding = 0
p.ygrid.grid_line_color = None

show(p)

5. bokeh barplot of crime types

In [43]:
# (Original Hungarian comment translated: "disgusting spaghetti code")
def get_hex_colors(data, cmap=None):
    """Build an evenly spaced hex-color palette, one color per element of `data`.

    Parameters
    ----------
    data : sized iterable
        Only its length is used: it determines how many colors to generate.
    cmap : callable, optional
        Matplotlib-style colormap mapping a float in [0, 1] to an RGBA tuple.
        Defaults to ``cm.jet`` (resolved lazily so the module can be imported
        without touching matplotlib state at definition time).

    Returns
    -------
    tuple of str
        '#rrggbb' strings sampled from the colormap end-to-start.
    """
    if cmap is None:
        cmap = cm.jet

    # Sample the colormap at len(data) evenly spaced points, reversed
    rgba_colors = [np.array(cmap(i)) for i in np.linspace(0, 1, len(data))[::-1]]
    # Scale RGB to 0-255; truncation (not rounding) matches the original palette
    rgb_255_colors = [(c[:3] * 255).astype('int') for c in rgba_colors]
    hex_colors = tuple(['#%02x%02x%02x' % tuple(c) for c in rgb_255_colors])

    return hex_colors
In [44]:
# Output HTML file of figure
output_file('./out/crime_per_years.html')

# Source of data
source = ColumnDataSource(data=crimes_N_df)

# Define figure
p = figure(x_range=crimes_N_df['Primary Type'],
           plot_width=1200, plot_height=800,
           toolbar_location='right', title='Distribution of crimes by type')
p.vbar(x='Primary Type', top='Count', width=0.9, source=source,
       line_color='white', fill_color=factor_cmap('Primary Type',
                                                  palette=get_hex_colors(crimes_N_df['Primary Type']),
                                                  factors=crimes_N_df['Primary Type']))

p.xaxis.major_label_orientation = 42/180 * np.pi

p.xgrid.grid_line_color = None
#p.legend.label_text_font_size = '6pt'
#p.legend.orientation = 'vertical'
#p.legend.location = (0, -60)

show(p)
In [ ]: